import os
import re
from io import StringIO

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.plotly as py
import seaborn as sb
from plotly.offline import init_notebook_mode, iplot

print(os.listdir("../input"))
init_notebook_mode(connected=True)
# Any results you write to the current directory are saved as output.
# Load the two raw tables: credits (m1) and movie metadata (m2).
m1 = pd.read_csv('../input/movies/tmdb_5000_credits.csv')
m2 = pd.read_csv('../input/movies/tmdb_5000_movies.csv')
# Notebook-style previews: bare expressions only render in an interactive cell.
m1.shape, m2.shape
m1.head(2)
# The 'crew' and 'cast' cells are parsed as JSON text. Wrap them in StringIO
# because passing a literal JSON string to read_json is deprecated in recent
# pandas releases.
pd.read_json(StringIO(m1['crew'][0])).head(2)
pd.read_json(StringIO(m1['cast'][0])).head(2)
def extractnum(x):
    """Return the number of rows pandas parses out of the JSON text ``x``.

    Any input that cannot be parsed (malformed JSON, non-string values such
    as NaN) yields 0 instead of raising.
    """
    try:
        # StringIO: read_json no longer reliably accepts literal JSON strings.
        return len(pd.read_json(StringIO(x)))
    except Exception:
        # Best-effort by design, but let KeyboardInterrupt/SystemExit through.
        return 0
def extractdirector(x):
    """Return the first distinct ``name`` whose ``department`` is 'Directing'.

    ``x`` is JSON text describing a list of records. Returns 'Not found' when
    the text cannot be parsed, the expected columns are missing, or no record
    belongs to the 'Directing' department.
    """
    try:
        # StringIO: read_json no longer reliably accepts literal JSON strings.
        crew = pd.read_json(StringIO(x))
        directing_names = crew.loc[crew['department'] == 'Directing', 'name']
        # NOTE(review): this picks the first person in the Directing
        # department, which may not be the director proper - confirm whether
        # a job-level filter is wanted.
        return directing_names.unique()[0]
    except Exception:
        # Best-effort by design, but let KeyboardInterrupt/SystemExit through.
        return 'Not found'
def extractlead(x):
    """Return the ``name`` of the first record in the JSON text ``x``.

    Returns 'Not found' when the text cannot be parsed, has no records, or
    has no ``name`` field.
    """
    try:
        # StringIO: read_json no longer reliably accepts literal JSON strings.
        cast = pd.read_json(StringIO(x))
        return cast['name'].iloc[0]
    except Exception:
        # Best-effort by design, but let KeyboardInterrupt/SystemExit through.
        return 'Not found'
# Derive per-movie summary columns from the JSON-encoded crew/cast columns.
m1['Director'] = m1['crew'].apply(extractdirector)
m1['total_crew'] = m1['crew'].apply(extractnum)
m1['total_cast'] = m1['cast'].apply(extractnum)
m1['lead_actor'] = m1['cast'].apply(extractlead)
m1.head(2)
# Drop the raw JSON columns now that the derived columns exist.
m11 = m1.drop(['cast', 'crew'], axis=1)
m11.head(2)
# CLEANING M2
m2.head(2)
# GENRE EXTRACTION
# StringIO: passing a literal JSON string to read_json is deprecated in
# recent pandas releases.
d = pd.read_json(StringIO(m2['genres'][0]))
d
from functools import reduce
df = pd.read_json(StringIO(m2['keywords'][0]))
# Preview of the space-joined names for a single row.
reduce(lambda x, y: x + ' ' + y, list(df['name']))
def extractgenre(x):
    """Return the ``name`` values in the JSON text ``x`` joined by single spaces.

    Returns the string 'None' when the text cannot be parsed, contains no
    records, has no ``name`` field, or the names are not all strings.
    """
    try:
        # StringIO: read_json no longer reliably accepts literal JSON strings.
        names = list(pd.read_json(StringIO(x))['name'])
        if not names:
            return 'None'
        return ' '.join(names)
    except Exception:
        # Best-effort by design, but let KeyboardInterrupt/SystemExit through.
        return 'None'
# One space-joined genre string per movie ('None' where extraction failed).
gen = pd.DataFrame()
gen['genre'] = m2['genres'].apply(extractgenre)
### COUNT VECTORIZER TO CREATE A TERM-FREQUENCY MATRIX OF GENRES
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english')
genbw = cv.fit_transform(gen['genre'])
# get_feature_names() was removed from newer scikit-learn releases; prefer the
# replacement and fall back only on versions that predate it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
genre = pd.DataFrame(genbw.toarray(), columns=feature_names)
genre.head(2)
# StringIO: passing a literal JSON string to read_json is deprecated in
# recent pandas releases.
pd.read_json(StringIO(m2['production_companies'][0])).head(2)
pd.read_json(StringIO(m2['production_countries'][0])).head(2)
def extractpc(x):
    """Return the ``name`` of the first record in the JSON text ``x``.

    Returns the string 'None' when the text cannot be parsed, has no records,
    or has no ``name`` field.
    """
    try:
        # StringIO: read_json no longer reliably accepts literal JSON strings.
        records = pd.read_json(StringIO(x))
        return records['name'].iloc[0]
    except Exception:
        # Best-effort by design, but let KeyboardInterrupt/SystemExit through.
        return 'None'
# First-listed production company / country per movie ('None' when missing).
m2['pro_company'] = m2['production_companies'].apply(extractpc)
m2['pro_country'] = m2['production_countries'].apply(extractpc)
### DROPPING ALL THE UNNECESSARY COLUMNS
m21 = m2.drop(['genres','homepage','keywords','production_countries','overview','production_companies','spoken_languages','tagline','title'], axis=1)
m21.head(2)
## MERGING THE DATAFRAMES
movies = pd.merge(m11, m21, how='inner', left_on='movie_id', right_on='id')
movies.head(2)
## REMOVING ID COLUMN AS IT'S REDUNDANT AND CONVERTING RELEASE DATE TO DATE
movies['release_date'] = pd.to_datetime(movies['release_date']).dt.date
movies.drop(['id'], axis=1, inplace=True)
movies['status'].value_counts()
# Keep only released movies with a positive budget and revenue. The explicit
# copy makes movies1 an independent frame, so adding columns to it later does
# not trigger SettingWithCopyWarning.
movies1 = movies[(movies['status'] == 'Released') & (movies['budget'] > 0) & (movies['revenue'] > 0)].copy()
movies1.head(2)
### SIMPLE FUNCTIONS TO IMPLEMENT PLOTLY GRAPHS
def boxtrace(df=None, col_name=None, boxpoints='outliers', boxmean=True):
    """Build a Plotly box trace for column ``col_name`` of ``df``.

    The trace is named after the column; ``boxpoints`` and ``boxmean`` are
    forwarded to ``go.Box`` unchanged.
    """
    column_values = df[col_name]
    return go.Box(
        name=col_name,
        y=column_values,
        boxmean=boxmean,
        boxpoints=boxpoints,
    )
def pietrace(df, col_name=None):
    """Build a Plotly pie trace of the value frequencies in ``df[col_name]``.

    Each distinct value becomes a slice label and its occurrence count the
    slice size.
    """
    frequencies = df[col_name].value_counts()
    slice_labels = list(frequencies.index)
    slice_sizes = list(frequencies.values)
    return go.Pie(labels=slice_labels, values=slice_sizes)
def violintrace(df=None, x_col=None, y_col=None, name=None):
    """Build a Plotly violin trace of ``df[y_col]`` with box and mean line shown.

    When ``x_col`` is truthy, ``df[x_col]`` is also passed as the x values;
    otherwise the trace is built from the y values alone.
    """
    trace_kwargs = {
        "y": df[y_col],
        "box": {"visible": True},
        "meanline": {"visible": True},
        "name": name,
    }
    if x_col:
        trace_kwargs["x"] = df[x_col]
    return go.Violin(**trace_kwargs)
def distplot(df=None, col_names=None, show_hist=False):
    """Build a figure-factory distplot with one distribution per column.

    ``col_names`` lists the columns of ``df`` to plot and doubles as the
    group labels; omitting it is treated as an empty list. Missing values in
    each column are replaced with -1 before plotting. ``show_hist`` is
    forwarded to ``ff.create_distplot``.
    """
    # None replaces the former mutable [] default, which is shared across calls.
    labels = [] if col_names is None else list(col_names)
    data = [df[label].fillna(-1) for label in labels]
    return ff.create_distplot(data, labels, show_hist=show_hist)
def bartrace(df=None, x_col=None, y_col=None, name=None):
    """Build a Plotly bar trace with ``df[x_col]`` on x and ``df[y_col]`` on y.

    ``name`` is used as the trace name.
    """
    bar_heights = df[y_col]
    bar_positions = df[x_col]
    return go.Bar(x=bar_positions, y=bar_heights, name=name)
def scattertrace(df=None, x_col=None, y_col=None, hover_col=None):
    """Build a Plotly marker-only scatter trace of ``df[y_col]`` vs ``df[x_col]``.

    ``df[hover_col]`` supplies the hover text. Markers are coloured with
    random normal values on the Viridis scale, so the colours carry no
    meaning and differ on every call.
    """
    # One random colour value per plotted row. The previous fixed 500 did not
    # match the number of points for frames of any other length.
    marker_colors = np.random.randn(len(df))
    return go.Scatter(
        y=df[y_col],
        x=df[x_col],
        hovertext=df[hover_col],
        mode='markers',
        marker=dict(
            size=16,
            color=marker_colors,
            colorscale='Viridis',
            showscale=True,
        ),
    )
## COMPOSITION OF MOVIES GENERATED BY COUNTRIES
# Ten most frequent production countries. 'None' is the placeholder written
# when extraction failed, so it is excluded. errors='ignore' avoids a KeyError
# when 'None' is not among the top ten. Working on the Series directly avoids
# relying on the column name value_counts() produces, which differs between
# pandas versions.
country_counts = movies['pro_country'].value_counts()[0:10].drop('None', errors='ignore')
iplot([go.Pie(labels=list(country_counts.index), values=list(country_counts.values))])
# Total budget and revenue for the five countries with the highest revenue.
df = movies1.groupby('pro_country')[['budget','revenue']].sum().sort_values('revenue', ascending=False).head(5)
df
data = [go.Bar(x=df.index, y=df['budget'], name='BUDGET'),
        go.Bar(x=df.index, y=df['revenue'], name='REVENUE')]
layout = go.Layout(
    barmode='group'
)
iplot(go.Figure(data=data, layout=layout), filename='grouped-bar')
# Same composition view for the ten most frequent production companies.
company_counts = movies['pro_company'].value_counts()[0:10].drop('None', errors='ignore')
iplot([go.Pie(labels=list(company_counts.index), values=list(company_counts.values))])
# Overall spread of revenue and budget, then budget against revenue per movie.
iplot([boxtrace(df=movies1, col_name=column) for column in ("revenue", "budget")])
iplot([violintrace(df=movies1, y_col=column) for column in ("budget", "revenue")])
iplot([scattertrace(movies1, x_col='budget', y_col='revenue', hover_col='title')])
## PRODUCTION COMPANY-WISE BUDGET COMPARISON
# The four studios compared below, in the order their violins are drawn.
studios = (
    "Paramount Pictures",
    "Universal Pictures",
    "Columbia Pictures",
    "Twentieth Century Fox Film Corporation",
)
data = [
    violintrace(df=movies1[movies1["pro_company"] == studio], y_col="budget", name=studio)
    for studio in studios
]
layout = {
    "title": "BUDGETS COMAPARISION",
    "yaxis": {"zeroline": False},
    "violinmode": "group",
}
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# Same comparison for revenue.
data = [
    violintrace(df=movies1[movies1["pro_company"] == studio], y_col="revenue", name=studio)
    for studio in studios
]
layout = {
    "title": "REVENUES COMAPARISION",
    "yaxis": {"zeroline": False},
    "violinmode": "group",
}
fig = go.Figure(data=data, layout=layout)
iplot(fig)
movies.head(1)
# Revenue laid out on a popularity x budget grid (missing cells filled with 0)
# and rendered as a surface.
pp = movies1.pivot(index='popularity', columns='budget', values='revenue').fillna(0).values.tolist()
iplot([go.Surface(z=pp)])
## MOST POPULAR MOVIES
df = movies.sort_values('popularity', ascending=False).head(10)
iplot([go.Bar(y=df['popularity'], x=df['title'], name='TOP 10 POPULAR MOVIES')])
# Profit as a multiple of budget, computed only for budgets above 1,000,000.
# Other rows get NaN through index alignment.
movies2 = movies1[movies1['budget'] > 1000000]
profit_ratio = (movies2['revenue'] - movies2['budget']) / movies2['budget']
# assign() returns a new frame, so the column is not written into a filtered
# slice of `movies`, which is what raised SettingWithCopyWarning.
movies1 = movies1.assign(profit_mve=profit_ratio)
df = movies1.sort_values('profit_mve', ascending=False).head(10)[['title','profit_mve']]
iplot([go.Bar(y=df['profit_mve'], x=df['title'], name='TOP 10 PROFIT GENERATED MOVIES')])
iplot([scattertrace(movies1, x_col='vote_count', y_col='vote_average', hover_col='title')])
# Revenue and budget over time, ordered by release date.
df = movies1.sort_values('release_date')
data = [go.Scatter(x=df['release_date'], y=df['revenue'], mode='lines+markers', name='REVENUE'),
        go.Scatter(x=df['release_date'], y=df['budget'], mode='lines', name='BUDGET')]
iplot(data, filename='line-mode')
iplot([scattertrace(movies1, x_col='total_crew', y_col='total_cast', hover_col='title')])
# Second dataset, with one row per movie and a lead-actor column.
m3 = pd.read_csv('../input/movies/movies_data.csv')
m3.head(2)
m3.actor_1_name.value_counts()[0:5]
# Six columns with the largest totals from column 4 onwards
# (presumably per-genre indicator columns - confirm against the CSV).
m3.iloc[:, 4:].apply(np.sum, axis=0).sort_values(ascending=False).head(6).index
# Genres used as the radar axes, in plotting order.
genres = ['Drama', 'Comedy', 'Thriller', 'Action', 'Romance', 'Adventure']
df = m3.groupby('actor_1_name')[genres].sum().reset_index()
df.head(2)
x = df[df["actor_1_name"] == "Jennifer Aniston"]
y = df[df["actor_1_name"] == "Brad Pitt"]
# One filled polar trace per actor, with the per-genre totals as radii.
data = [
    go.Scatterpolar(
        r=[actor_row[genre].values[0] for genre in genres],
        theta=genres,
        fill='toself',
        name=actor_row["actor_1_name"].values[0],
    )
    for actor_row in (x, y)
]
radial_axis = dict(visible=True)
layout = go.Layout(
    polar=dict(radialaxis=radial_axis),
    showlegend=True,
)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="ACTOR CAREER STATS")